In this notebook, you will be introduced to some text visualizations. Just as with numerical data science, visual representation plays an important role in text analysis. Text lacks the inherent numerical structure of other data, but there are still many effective ways to display it visually.
You will be working with the complete set of Shakespearean plays, categorized into Comedies, Tragedies, and Histories.
In this notebook, we illustrate the importance of sectioning in the reporting stage by NOT providing a table of contents with navigational aids.
You'll also notice that we didn't suggest exercises, as visualizations will pop out throughout the other notebooks.
We start by initializing our trusty companions, the tm and qdap libraries. Other libraries will join them shortly.
library('tm')
library('qdap') # for cleaning, barcharts, and word networks
First, we create corpora for the three categories of Shakespearean plays (conveniently saved in the Data/ShakespeareComedies/, Data/ShakespeareTragedies/, and Data/ShakespeareHistories/ folders).
# Load the three collections of plays (one text file per play) into tm corpora.
# NOTE: the plays are in English, so the corpora are tagged "en"; the original
# "lat" (Latin) language tag was incorrect metadata.
corpus_C <- Corpus(DirSource("Data/ShakespeareComedies/"), readerControl = list(language = "en"))
corpus_T <- Corpus(DirSource("Data/ShakespeareTragedies/"), readerControl = list(language = "en"))
corpus_H <- Corpus(DirSource("Data/ShakespeareHistories/"), readerControl = list(language = "en"))
# Sanity check: one document per play in each corpus.
summary(corpus_C)
summary(corpus_T)
summary(corpus_H)
Then we build a cleaning function for the corpora.
# Clean a tm corpus for analysis.
#
# The transformations are ordered so that each step sees what it expects:
#   1. lower-case FIRST, so stopword matching and stemming are
#      case-insensitive (the original lower-cased after stemming, so
#      e.g. "The" and "I" were never removed and capitalized words
#      stemmed differently);
#   2. strip punctuation and digits;
#   3. remove stopwords BEFORE stemming, so they still match the
#      unstemmed English stopword list, plus a few Shakespeare-specific
#      extras ("thou", "thee", "thy"; "thi" kept as it is the stem of "thy");
#   4. stem what remains and collapse leftover whitespace.
#
# Returns the cleaned corpus.
clean_corpus <- function(corpus){
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, removeWords,
                   c(stopwords("english"), c("thou", "thee", "thy", "thi")))
  corpus <- tm_map(corpus, stemDocument, language = "english")
  corpus <- tm_map(corpus, stripWhitespace)
  return(corpus)
}
# Run the cleaning pipeline over each category's corpus.
clean_C <- clean_corpus(corpus_C)
clean_T <- clean_corpus(corpus_T)
clean_H <- clean_corpus(corpus_H)

# The 20 most frequent terms per category (qdap::freq_terms).
term_count_C <- freq_terms(clean_C, 20)
term_count_T <- freq_terms(clean_T, 20)
term_count_H <- freq_terms(clean_H, 20)

# Visualize the three frequency tables.
plot(term_count_C)
plot(term_count_T)
plot(term_count_H)
We can also take a look at some basic statistics regarding the number of characters and the number of words in each play.
# Statistics on the number of characters per play in each category.
# The loop bounds are derived from each corpus (length(clean_X)) instead of
# being hard-coded to 17/10/10, so the code still works if plays are added
# or removed from the data folders.
length_of_plays_char_C <- vector(mode = "numeric", length = length(clean_C))
for (j in seq_along(clean_C)) {
  length_of_plays_char_C[j] <- nchar(clean_C[[j]][1])
}
hist(length_of_plays_char_C, freq = FALSE,
     main = "Distribution of # of characters in Shakespeare's Comedies")
summary(length_of_plays_char_C)

length_of_plays_char_T <- vector(mode = "numeric", length = length(clean_T))
for (j in seq_along(clean_T)) {
  length_of_plays_char_T[j] <- nchar(clean_T[[j]][1])
}
hist(length_of_plays_char_T, freq = FALSE,
     main = "Distribution of # of characters in Shakespeare's Tragedies")
summary(length_of_plays_char_T)

length_of_plays_char_H <- vector(mode = "numeric", length = length(clean_H))
for (j in seq_along(clean_H)) {
  length_of_plays_char_H[j] <- nchar(clean_H[[j]][1])
}
hist(length_of_plays_char_H, freq = FALSE,
     main = "Distribution of # of characters in Shakespeare's Histories")
summary(length_of_plays_char_H)
# Statistics on the number of words per play in each category.
# Word count = collapse runs of two or more spaces to one, split on single
# spaces, and count the pieces. As above, loop bounds come from the corpus
# itself rather than hard-coded play counts.
length_of_plays_word_C <- vector(mode = "numeric", length = length(clean_C))
for (j in seq_along(clean_C)) {
  length_of_plays_word_C[j] <- length(strsplit(gsub(" {2,}", " ", clean_C[[j]][1]), " ")[[1]])
}
hist(length_of_plays_word_C, freq = FALSE,
     main = "Distribution of # of words in Shakespeare's Comedies")
summary(length_of_plays_word_C)

length_of_plays_word_T <- vector(mode = "numeric", length = length(clean_T))
for (j in seq_along(clean_T)) {
  length_of_plays_word_T[j] <- length(strsplit(gsub(" {2,}", " ", clean_T[[j]][1]), " ")[[1]])
}
hist(length_of_plays_word_T, freq = FALSE,
     main = "Distribution of # of words in Shakespeare's Tragedies")
summary(length_of_plays_word_T)

length_of_plays_word_H <- vector(mode = "numeric", length = length(clean_H))
for (j in seq_along(clean_H)) {
  length_of_plays_word_H[j] <- length(strsplit(gsub(" {2,}", " ", clean_H[[j]][1]), " ")[[1]])
}
hist(length_of_plays_word_H, freq = FALSE,
     main = "Distribution of # of words in Shakespeare's Histories")
summary(length_of_plays_word_H)
Let's create TDM matrices from clean_C, clean_T, and clean_H, and deal with sparsity.
# Build one term-document matrix per category in a single pass.
tdm_list <- lapply(list(clean_C, clean_T, clean_H), TermDocumentMatrix)
C_tdm <- tdm_list[[1]]
T_tdm <- tdm_list[[2]]
H_tdm <- tdm_list[[3]]

# Metadata before pruning: number of terms, documents, and sparsity.
C_tdm
T_tdm
H_tdm

# Drop terms missing from more than 75% of the documents, i.e. keep only
# terms that appear in at least ~25% of the plays of a category.
C_tdm <- removeSparseTerms(C_tdm, sparse = 0.75)
T_tdm <- removeSparseTerms(T_tdm, sparse = 0.75)
H_tdm <- removeSparseTerms(H_tdm, sparse = 0.75)

# Metadata after pruning.
C_tdm
T_tdm
H_tdm

# Dense matrix versions for row/column arithmetic.
C_m <- as.matrix(C_tdm)
T_m <- as.matrix(T_tdm)
H_m <- as.matrix(H_tdm)

# Dimensions: (terms kept) x (documents).
dim(C_m)
dim(T_m)
dim(H_m)
Now for some nice barcharts
# Corpus-wide frequency of each term = row sums of the dense TDM,
# sorted most-frequent first.
term_frequency_C <- sort(rowSums(C_m), decreasing = TRUE)
term_frequency_T <- sort(rowSums(T_m), decreasing = TRUE)
term_frequency_H <- sort(rowSums(H_m), decreasing = TRUE)

# Inspect the 20 most common words per category.
term_frequency_C[1:20]
term_frequency_T[1:20]
term_frequency_H[1:20]

# Barcharts of those top-20 terms (las = 2 rotates the axis labels).
barplot(term_frequency_C[1:20], col = "tan", las = 2)
barplot(term_frequency_T[1:20], col = "tan", las = 2)
barplot(term_frequency_H[1:20], col = "tan", las = 2)
And finally some wordclouds.
# Load wordcloud package
library('wordcloud')

# Build a term/frequency data frame from a named frequency vector.
# Returns a data frame with columns "term" (character) and "num" (numeric),
# one row per term, replacing the three hand-copied construction blocks.
make_word_freqs <- function(term_frequency) {
  data.frame(term = names(term_frequency),
             num  = as.numeric(term_frequency),
             stringsAsFactors = FALSE)
}

word_freqs_C <- make_word_freqs(term_frequency_C)
word_freqs_T <- make_word_freqs(term_frequency_T)
word_freqs_H <- make_word_freqs(term_frequency_H)

# One wordcloud per category (top 100 terms each).
wordcloud(word_freqs_C$term, word_freqs_C$num, max.words = 100, colors = "red")
wordcloud(word_freqs_T$term, word_freqs_T$num, max.words = 100, colors = "blue")
wordcloud(word_freqs_H$term, word_freqs_H$num, max.words = 100, colors = "black")
To make a commonality cloud and a comparison cloud, we first create a list of all (cleaned) words in the comedies, tragedies, and histories, from clean_C, clean_T, clean_H:
# Print the corpus summaries as a reminder of the document counts.
clean_C
clean_T
clean_H
# Concatenate every (cleaned) play of a corpus into one long string,
# separated by single spaces. This replaces the original hand-written
# 17- and 10-argument paste() calls and works for any number of plays.
collapse_corpus <- function(corpus) {
  paste(sapply(seq_along(corpus), function(j) corpus[[j]][1]), collapse = " ")
}
all_c <- collapse_corpus(clean_C)
all_t <- collapse_corpus(clean_T)
all_h <- collapse_corpus(clean_H)
Collect those three long strings (one per category) into a single character vector:
# Wrap the three mega-strings as a vector source: each string becomes one
# document (1 = comedies, 2 = tragedies, 3 = histories).
all_ws <- VectorSource(c(all_c, all_t, all_h))
ws_corpus <- VCorpus(all_ws)

# Confirm we ended up with a three-document corpus.
ws_corpus
inspect(ws_corpus)
Now we create a TDM for this corpus, which we cast as a matrix object:
# TDM over the three combined documents, with readable column names,
# then its dense matrix form for the cloud/pyramid functions below.
ws_tdm <- TermDocumentMatrix(ws_corpus)
colnames(ws_tdm) <- c("Com.", "Trag.", "Hist.")
ws_m <- as.matrix(ws_tdm)
The commonality cloud can be printed using ... commonality.cloud:
# Commonality clouds (terms shared by all three categories) at three sizes.
for (spec in list(list(col = "darkblue",  n = 100),
                  list(col = "darkgreen", n = 200),
                  list(col = "darkred",   n = 500))) {
  commonality.cloud(ws_m, colors = spec$col, max.words = spec$n)
}
And the comparison cloud using... comparison.cloud:
# Comparison clouds (terms distinctive to each category) at three sizes;
# one color per document column of ws_m.
cloud_palette <- c("darkred", "darkgreen", "darkblue")
for (n_words in c(100, 200, 500)) {
  comparison.cloud(ws_m, colors = cloud_palette, max.words = n_words)
}
We can also do pyramid plots by first finding the terms that are common to any two corpora:
# Terms used at least once in BOTH members of each pair of categories.
# Columns of ws_m: 1 = comedies, 2 = tragedies, 3 = histories.
# drop = FALSE keeps the matrix shape, matching subset()'s behavior.
common_words_CT <- ws_m[ws_m[, 1] > 0 & ws_m[, 2] > 0, , drop = FALSE]
dim(common_words_CT)
common_words_CH <- ws_m[ws_m[, 1] > 0 & ws_m[, 3] > 0, , drop = FALSE]
dim(common_words_CH)
common_words_TH <- ws_m[ws_m[, 2] > 0 & ws_m[, 3] > 0, , drop = FALSE]
dim(common_words_TH)
The differences in the number of times each token is used in each corpora can be computed with:
# Absolute per-term usage gap between the two categories of each pair.
difference_CT <- abs(common_words_CT[, 1] - common_words_CT[, 2])
difference_CH <- abs(common_words_CH[, 1] - common_words_CH[, 3])
difference_TH <- abs(common_words_TH[, 2] - common_words_TH[, 3])
Let's bind these new counts to the respective common_word corpora, and order them along the differences:
# Append the usage gap as a 4th column and sort each table by it,
# largest difference first (order(-x) is equivalent to decreasing = TRUE).
common_words_CT <- cbind(common_words_CT, difference_CT)
common_words_CT <- common_words_CT[order(-common_words_CT[, 4]), ]
common_words_CH <- cbind(common_words_CH, difference_CH)
common_words_CH <- common_words_CH[order(-common_words_CH[, 4]), ]
common_words_TH <- cbind(common_words_TH, difference_TH)
common_words_TH <- common_words_TH[order(-common_words_TH[, 4]), ]
Let's say we want to plot the top $n=30$ words for each pair of corpora.
n <- 30
# Build one plotting data frame per pair of categories: x = counts in the
# first category's column, y = counts in the second's, labels = the terms.
top_terms <- function(mat, col_x, col_y) {
  data.frame(x = mat[1:n, col_x],
             y = mat[1:n, col_y],
             labels = rownames(mat)[1:n])
}
top_df_CT <- top_terms(common_words_CT, 1, 2)
top_df_CH <- top_terms(common_words_CH, 1, 3)
top_df_TH <- top_terms(common_words_TH, 2, 3)
top_df_CT
top_df_CH
top_df_TH
Finally, we produce the pyramid plots themselves:
library(plotrix)

# Draw one back-to-back pyramid plot for a pair of categories:
# bars for the first category extend left, the second right.
plot_pyramid <- function(df, left_label, right_label) {
  pyramid.plot(df$x, df$y, labels = df$labels,
               gap = 500, top.labels = c(left_label, "Terms", right_label),
               main = "Common Terms",
               laxlab = NULL, raxlab = NULL, unit = NULL)
}
plot_pyramid(top_df_CT, "Comedies", "Tragedies")
plot_pyramid(top_df_CH, "Comedies", "Histories")
plot_pyramid(top_df_TH, "Tragedies", "Histories")
We can also try to look at word associations / phrase nets: nodes represent terms, and links represent connections between terms. First, we put all the contents of the various tragedies in a data frame:
# Stack the content of every tragedy into a data frame, one play per row.
# do.call(rbind, lapply(...)) over the whole corpus replaces the original
# ten hand-written clean_T[[j]]$content arguments, so this no longer breaks
# if the number of plays changes.
Tragedies <- do.call(rbind, lapply(seq_along(clean_T), function(j) clean_T[[j]]$content))
Tragedies <- as.data.frame(Tragedies)
# Keep the text as plain character (under R < 4.0 the column would
# otherwise be a factor).
Tragedies$Tragedies <- as.character(Tragedies$V1)
Let's look for associations with the word "master".
THERE'S AN ISSUE WITH ONE OF THE PACKAGES ON THE SERVER; THIS STEP COULD TAKE A FEW HOURS TO RUN. We're leaving the code as an example, but don't worry about running this part.
#word_associate(Tragedies$Tragedies,match.string=c("master"), stopwords=c(Top200Words),network.plot=TRUE,cloud.colors=c("gray70","darkblue"))
#title(main="Terms Associated with 'Master' in Shakespeare's Tragedies")